#' ---
#' title: "Agenda Seeding: Electoral Data Prep"
#' date: "`r Sys.Date()`"
#' output: html_document
#' header-includes:
#'  - \usepackage{booktabs}
#'  - \usepackage{longtable}
#'  - \usepackage{array}
#'  - \usepackage{multirow}
#'  - \usepackage{wrapfig}
#'  - \usepackage{float}
#'  - \usepackage{colortbl}
#'  - \usepackage{pdflscape}
#'  - \usepackage{tabu}
#'  - \usepackage{threeparttable}
#'  - \usepackage{threeparttablex}
#'  - \usepackage[normalem]{ulem}
#'  - \usepackage{makecell}
#'  - \usepackage{dcolumn}
#'  - \usepackage{setspace}\doublespacing
#' ---


## ---- electoral_prep_spin_code, eval = FALSE, include = FALSE ----
# spin code to output Rmd / Rnw
# set output_format to "html_document" for html
# rmarkdown::render(input = here::here("code", "electoral_data_prep3.R"), output_format = "pdf_document", clean = TRUE)



## ---- electoral_load_packages, include = FALSE ----
# Loading necessary packages
library(stringr)
library(here)
library(janitor)
library(dplyr)
library(car)
library(fields)
library(purrr)
library(naniar)
library(tidyr)

library(simputation)

library(conflicted)
conflict_prefer("filter", "dplyr")
conflict_prefer("select", "dplyr")
conflict_prefer("here",   "here")


# load custom functions
fiver <- function(x) {
  str_pad(
    string = x,
    width  = 5,
    side   = "left",
    pad    = "0"
  )
}





## ---- electoral_data_load_scrub, include = FALSE ----

# Load state abbreviations from FIPS dataset and cleaning
st <- read.csv(here("data/icpsr_states_fips_census.csv"))

st <- st %>%
    clean_names() %>%
    rename(state = name, 
           st    = stateab) %>% 
    mutate(state = str_to_upper(state))

# Loading ICPSR voting data
load(here("data/ICPSR_08611/dvnDataFile_509482.RData"), verbose = TRUE)

electoraldata_raw <- x

# Merging voting data and 'st'
electoraldata_merged <- merge(electoraldata_raw, st, by.x = "V1", by.y = "stateicp")

# Creating subset of voting data with, county name, number, state, st, and 1932 to 1972 data
electoral_3272 <- electoraldata_merged %>%
  select(1:3, 760:763, 498:689)


# Creating column labels for 'electoral_3272'
cols <- c(
  "stcode", "COUNTY", "COUNTYID", "ST", "STATE", "fips_st", "STATECEN",

  "Y32PDEM", "Y32PREP", "Y32PSOC", "Y32POTH", "Y32PTTL", "Y32PTO", "Y32CDEM", "Y32CREP", "Y32CSOC", "Y32CFAR", "Y32COTH", "Y32CTTL", "Y32CTO", "Y34CDEM", "Y34CREP", "Y34CDEQ", "Y34CREQ", "Y34CSOC", "Y34CPRO", "Y34CFAR", "Y34COTH", "Y34CTTL", "Y34CTO",

  "Y36PDEM", "Y36PREP", "Y36PUNI", "Y36POTH", "Y36PTTL", "Y36PTO", "Y36CDEM", "Y36CREP", "Y36CDEQ", "Y36CREQ", "Y36CPRO", "Y36CFAR", "Y36COTH", "Y36CTTL", "Y36CTO", "Y38CDEM", "Y38CREP", "Y38CDEQ", "Y38CREQ", "Y38CSOC", "Y38CPRO", "Y38CFAR", "Y38COTH", "Y38CTTL", "Y38CTO",

  "Y40PDEM", "Y40PREP", "Y40PREQ", "Y40POTH", "Y40PTTL", "Y40PTO", "Y40CDEM", "Y40CREP", "Y40CPRO", "Y40CFAR", "Y40COTH", "Y40CTTL", "Y40CTO", "Y42CDEM", "Y42CREP", "Y42CREQ", "Y42CPRO", "Y42CFAR", "Y42COTH", "Y42CTTL", "Y42CTO",


  "Y44PDEM", "Y44PREP", "Y44PDEQ", "Y44POTH", "Y44PTTL", "Y44PTO", "Y44CDEM", "Y44CREP", "Y44CREQ", "Y44CPRO", "Y44COTH", "Y44CTTL", "Y44CTO", "Y46CDEM", "Y46CREP", "Y46CDEQ", "Y46COTH", "Y46CTTL", "Y46CTO",

  "Y48PDEM", "Y48PREP", "Y48PPRO", "Y48PSTR", "Y48POTH", "Y48PTTL", "Y48PTO", "Y48CDEM", "Y48CREP", "Y48CPRO", "Y48COTH", "Y48CTTL", "Y48CTO", "Y50CDEM", "Y50CREP", "Y50COTH", "Y50CTTL", "Y50CTO",

  "Y52PDEM", "Y52PREP", "Y52POTH", "Y52PTTL", "Y52PTO", "Y52CDEM", "Y52CREP", "Y52COTH", "Y52CTTL", "Y52CTO", "Y54CDEM", "Y54CREP", "Y54COTH", "Y54CTTL", "Y54CTO",

  "Y56PDEM", "Y56PREP", "Y56POTH", "Y56PTTL", "Y56PTO", "Y56CDEM", "Y56CREP", "Y56COTH", "Y56CTTL", "Y56CTO", "Y58CDEM", "Y58CREP", "Y58COTH", "Y58CTTL", "Y58CTO",

  "Y60PDEM", "Y60PREP", "Y60POTH", "Y60PTTL", "Y60PTO", "Y60CDEM", "Y60CREP", "Y60COTH", "Y60CTTL", "Y60CTO", "Y62CDEM", "Y62CREP", "Y62COTH", "Y62CTTL", "Y62CTO",

  "Y64PDEM", "Y64PREP", "Y64POTH", "Y64PTTL", "Y64PTO", "Y64CDEM", "Y64CREP", "Y64COTH", "Y64CTTL", "Y64CTO", "Y66CDEM", "Y66CREP", "Y66COTH", "Y66CTTL", "Y66CTO",

  "Y68PDEM", "Y68PREP", "Y68PWALL", "Y68POTH", "Y68PTTL", "Y68PTO", "Y68CDEM", "Y68CREP", "Y68COTH", "Y68CTTL", "Y68CTO", "Y70CDEM", "Y70CREP", "Y70COTH", "Y70CTTL", "Y70CTO",

  "Y72PDEM", "Y72PREP", "Y72POTH", "Y72PTTL", "Y72PTO", "Y72CDEM", "Y72CREP", "Y72COTH", "Y72CTTL", "Y72CTO"
)

# Naming columns in 'electoral_3272' dataset, and cleaning them
colnames(electoral_3272) <- cols

electoral_3272 <- electoral_3272 %>% clean_names()

# drop 9999 countyids
electoral_3272 <- electoral_3272 %>%
  filter(countyid != 9999)


# Removing slash and dated county names following slash
#  (code removes slash and splits name in two, and then ditches the second name)
slash <- grep("/", electoral_3272$county)

splitmatrix <- electoral_3272$county[slash] %>% 
    strsplit("/") %>% 
    unlist() %>% 
    matrix(ncol = 2, byrow = TRUE)

electoral_3272$county[slash] <- splitmatrix[, 1]

# Converting all ST to SAINT
# Fixing shortening of one county
# Adding the word " COUNTY" to every entry in column 'county'
electoral_3272$county <- electoral_3272$county %>%
  str_replace_all("^ST ", "SAINT ") %>%
  str_replace_all("ST JOHN THE BAPTI", "ST JOHN THE BAPTIST") %>%
  paste(., "COUNTY")


# clean / cut four digit countyids. 
# Units with 4 digit countyids have almost no voting data
# and appear to be early units that later merge or go away or are reservations

electoral_3272 <- electoral_3272 %>%
  filter(str_detect(countyid, "0$"))

# drop trailing 0 by dividing all FIPS county codes by 10
electoral_3272$countyid <- electoral_3272$countyid / 10

# Creating variable called 'fips' by combining st + cty
electoral_3272$fips_st <- electoral_3272$fips_st %>% 
  str_pad(string = .,
    width = 2,
    side = "left",
    pad = "0"
  )

electoral_3272$fips_cty <- electoral_3272$countyid %>% 
  str_pad(string = .,
          width = 3,
          side = "left",
          pad = "0"
  )

electoral_3272$fips <- paste0(electoral_3272$fips_st, electoral_3272$fips_cty)

# drop AK and HI
electoral_3272 <- electoral_3272 %>% 
    filter(!state %in% c("ALASKA", "HAWAII"))

dim(electoral_3272) # 3156

# Saving dataset from 1932-1972
write.csv(electoral_3272, file = here("data/electoral_3272.csv"), row.names = FALSE)



## ---- geocoded_electoral_data, include = FALSE ----

# Loading dataset with geocodes
eg2 <- readr::read_csv(file = here("data/county_st_geocoded.csv"))

# Drop AK and HI
eg2 <- eg2 %>% filter(!state %in% c("ALASKA", "HAWAII"))

## Correct Geo Coding of LA
eg2[eg2$county == "LOS ANGELES", c("bg_lat", "bg_long")] <- c(34.05223, -118.2437)

dim(eg2) # 3100, ~22 counties lost from pre-geocode

# merge electoral data and geocodes
e2 <- left_join(
    electoral_3272,
    eg2 %>% select(fips, bg_lat, bg_long),
    by = c("fips")
)

# re-order columns, rename stcode
e2 <- e2 %>% 
    select(fips, county, st, fips_st, fips_cty, everything()) %>% 
    rename(stateicp = stcode)



## ---- clean_electoral, include = FALSE ----

##Identify columns for votes in '30s, '40s, '70s in which data is recorded with three digits. Identify columns for votes in '50s, '60s in which data is recorded with four digits

threedigits <- grep("^y3.|^y4.|^y7.", names(e2))
fourdigits  <- grep("^y5.|^y6.",      names(e2))
ttl.to      <- grep("ttl$|to$",       names(e2))
ttl         <- grep("ttl$",           names(e2))
to          <- grep("to$",            names(e2))

## Remove the columns with vote totals and turn out from the adjustments
threedigits <- setdiff(threedigits, ttl.to)
fourdigits  <- setdiff(fourdigits,  ttl.to)

## Turn both three digit and four digit columns into two digit percentages plus some decimal
e2[, threedigits] <- e2[, threedigits] * 0.10
e2[, fourdigits]  <- e2[, fourdigits]  * 0.01
e2[, to]          <- e2[, to]          * 0.10


## Convert 9999s to NA
e2 <- e2 %>% 
    mutate_if(
        is.numeric, 
        .funs = function(x)
        ifelse(x=="999.9" | x=="999.99" | x=="99.99" | x=="9999999", NA, x)
        )


## ---- subset_data_to_essential_years, include = FALSE ---- 

# drop y32 to y46, and all congressional
e3 <- e2 %>% 
    select(fips, county, st, fips_st, fips_cty, 
           stateicp, state, statecen, bg_lat, bg_long,
        matches("y44p|y48p|y52p|y56p|y60p|y64p|y68p|y72p")) 


e4 <- e3 %>% filter(is.na(bg_lat) == FALSE)

# check misssingness
sum(is.na(e4$y60pdem))
sum(is.na(e4$y64pdem))
sum(is.na(e4$y68pdem))
sum(is.na(e4$y72pdem))

#e4[which(is.na(e4$y60pdem)), ] %>% View()

miss60 <- which(is.na(e4$y60pdem))
miss72 <- which(is.na(e4$y72pdem))

## impute_missingness_via_em_algorithm, include = FALSE
e5 <- simputation::impute_em(e4, y60pdem ~ y48pdem + y52pdem + y56pdem + 
                                 y64pdem + y68pdem + y72pdem | fips_st)

e5 <- simputation::impute_em(e5, y72pdem ~ y48pdem + y52pdem + y56pdem + y60pdem +
                                 y64pdem + y68pdem | fips_st)

e5[miss60,] %>% head()
e5[miss72,] %>% head()

# check misssingness
sum(is.na(e5$y60pdem))
sum(is.na(e5$y64pdem))
sum(is.na(e5$y68pdem))
sum(is.na(e5$y72pdem))

#e5 <- e4 %>% filter(is.na(y68pdem) == FALSE)

dim(e5) # 3100

    

## ---- clean_mississippi_and_wallace_in_1968, include = FALSE ----

## Correct Mississippi entries in 1968 by multiplying by 10
e5 <- e5 %>% 
    mutate(y68pdem = ifelse(st == "MS", y68pdem * 10, y68pdem),
           y68prep = ifelse(st == "MS", y68prep * 10, y68prep),
           y68poth = ifelse(st == "MS", y68poth * 10, y68poth)  
           )

## Correct All ST Wallace entries in 1968 by multiplying by 10
e5$y68pwall <- e5$y68pwall * 10


dim(e5) # 3100


## ---- create_panel_version_of_electoral_data, include = FALSE ----

e5$y48plag <- e5$y44pdem
e5$y52plag <- e5$y48pdem
e5$y56plag <- e5$y52pdem
e5$y60plag <- e5$y56pdem

e5$y64plag <- e5$y60pdem
e5$y68plag <- e5$y64pdem
e5$y72plag <- e5$y68pdem


e6 <- e5 %>% pivot_longer(
    cols      = -c(fips, county, st, fips_st, fips_cty, stateicp, 
              state, statecen, bg_lat, bg_long),
    names_to  = "variable",
    values_to = "value"
    )


# extract y, two digits, text
split_variable_col <- str_match(e6$variable, "(y)(\\d\\d)([[:alpha:]]+)") %>% 
    as.data.frame() 

# rename cols
names(split_variable_col) <- c("variable_full", "y", "year", "variable")

e7 <- bind_cols(e6 %>% select(-variable), 
                split_variable_col %>% select(year, variable) ) %>% 
      mutate(year = paste0("19", year) %>% as.numeric() )

e7 <- e7 %>% 
    # keep only some variables (DROPPED: pdeq, pstr, ppro)
    filter(variable %in% c("pdem", "prep", "poth", "pwall", "pto", "pttl", "plag") == TRUE)


# transform to wide format data
e8 <- pivot_wider(
    e7,
    id_cols = c(county, year, st, fips, fips_st, fips_cty, stateicp, state, statecen, bg_lat, bg_long ),
    names_from =  variable,
    values_from = value
)


## ---- save_electoral_panel ----

save(e8, file = here("data/electoral_4472_geocoded_panel.Rdata"))
#write_csv(x = e8, path = here("data/electoral_4472_geocoded_panel.csv"), na = "")

## ---- create_electoral_codebook, eval = FALSE ----

dataMaid::makeCodebook(
    e8,
    file = here("codebooks/codebook_e8.Rmd"),
    reportTitle = "Codebook for Electoral Data, 1944-1972",
    checks = list(character = NULL, factor = NULL), # suppress chr/fct output
    replace = TRUE
)
